library(survey)
library(mice)
library(dplyr)
library(tidyr)
library(haven)

set.seed(123)

rm(list=ls())

## define the subdirectories
in_dir <- "./data"
aux_dir <- "./misc_inputs"
out_dir <- "./output"

## set constants
n_imp <- 20L  ## number of imputations

countries <- c("CZ","DE","ES","FR","HU","IT","NL","RO","SE","GB")
names(countries) <- countries

### aux functions
### raking
az_rake <- function(x, id, by, on, wf.ds, pop.mg) {
  dli <- split(x, x[[by]])
  over <- names(dli)
  weights <- lapply(over, function(cn) {
    wd <- subset(wf.ds, wf.ds[[by]]==cn & wf.ds[[id]] %in% dli[[cn]][[id]])
    wt.template <- lapply(on, function(j) { 
      ini <- pop.mg[[cn]][[j]]
      ini <- ini[ini[[j]]!="UNK" & ini[[j]] %in% wd[[j]],]
      unk <- mean(wd[[j]]=="UNK")
      unk.df <- data.frame(k="UNK",value=unk)
      colnames(unk.df)[1] <- j
      ini$value <- (1-unk)*ini$value/sum(ini$value)
      if (unk>0) ini <- rbind(ini, unk.df)
      ini
    })
    sur <- survey::svydesign(ids=as.formula(paste0('~',id)), probs= ~1, data= wd)
    sur.r <- survey::rake(sur, sample.margins = lapply(paste0("~",on), as.formula), 
                          population.margins = wt.template, 
                          control = list(maxit = 200))
    wt <- stats::weights(sur.r)
    wt <- wt/mean(wt)
    wt[wt>3] <- 3
    s <- data.frame(row=sur.r$cluster[[id]], wt=wt/mean(wt))
    merge(x, s, by=id)
  })
  weights$make.row.names <- FALSE
  do.call('rbind', weights)
}

## load data
dt <- read_sav(file.path(in_dir, "PRECEDE3_survey.sav"))
pdata <- read_sav(file.path(in_dir, "PEPS_plus.sav"))
bmks <- readRDS(file.path(aux_dir, "Census benchmarks.rds"))
pol.mod <- openxlsx::read.xlsx(file.path(aux_dir, "manifest_variables.xlsx")) |> na.omit()
wf.recall <- openxlsx::read.xlsx(file.path(aux_dir, "vote_distribution.xlsx"), sheet="margins")

## manifest variables for the measurement models
target <- with(pol.mod, setNames(vnam, vabbr))
reverse <- subset(pol.mod, reverse=='Y', select="vabbr", drop=TRUE)

## exclude regions with strong regional parties
vbulk <- subset(dt, 
             cntry %in% countries & 
               !wf.region %in% c('UKL','UKM','UKN',
                                 'ES11','ES13','ES21','ES22','ES24',
                                 'ES51','ES52','ES64','ES70')) |>
  mutate(
    female = ifelse(wf.gender=='F',1,0),
    age_under30 = ifelse(wf.age=='lt30',1,0),
    age_over50 = ifelse(wf.age=='ge50',1,0),
    university = ifelse(wf.edu=='T',1,0)
    ) |> select(cntry, row, female, university, age_under30, age_over50, 
                cntry_party_ivc, all_of(target)) |> 
## reverse the scales of manifest variables as necessary
  mutate(across(all_of(reverse), ~ 6L-.x)) 

## remove cases when more than 20% of manifest variable values are missing 
pol_nmi <- apply(is.na(vbulk[,names(target)]), 1L, mean) 
sam0 <- vbulk |> filter(pol_nmi<0.2) |> 
  mutate(cntry_party_ivc = coalesce(cntry_party_ivc, 'UNK'))

## add vote distributions to the raking benchmarks
for (cn in countries) {
  bmks[[cn]][['wf.recall']] <- wf.recall |> filter(cntry==cn) |>
    select(wf.recall, value)
} 

## compute weights
wei.vars <- c("wf.gender","wf.age","wf.edu","wf.region",'wf.recall' )
wei.factors <- dt |>
  select(row, cntry, all_of(wei.vars)) |>
  mutate(wf.recall)
sam <- az_rake(sam0, id='row', by='cntry', on=wei.vars, wf.ds=wei.factors, pop.mg=bmks) 
sams <- split(sam, sam$cntry)

## save data that should not be imputed
vbg <- lapply(sams, function(u) {
  for (v in names(target)) {
    u[[v]] <- NULL
  }
  u
})

## multiple imputations
vbulk.i <- lapply(sams, function(u) { 
  w <- u
  w$cntry <- w$row <- w$wt <-NULL 
  for (v in names(target)) {
    w[[v]] <- ordered(w[[v]], levels=1:5) 
  }
  mice::mice(w, m=n_imp, seed=123, method='rf')
}) 

### party-level information 
pbulk <- pdata |> 
  select(cntry, row, cntry_party, all_of(target)) |>
  mutate(across(all_of(reverse), ~ 6L-.x)) |>
  mutate(across(all_of(names(target)), ~ordered(.x, levels=1:5)))

pbg <- pdata |> 
  select(cntry, cntry_party, popul, party, ss, poppa_populism) |>
  distinct()

## save data
save(list=c("vbulk.i","pbulk"), file= file.path(in_dir, "ready_for_CFA.RData"))
save(list=c("vbg","pbg"), file= file.path(in_dir, "additional_data.RData"))
